gffread /data/biodatabase/species/sugar_glider_slb_v1/genome/anno/sugarglider.gff3 -T -o /data/biodatabase/species/sugar_glider_slb_v1/genome/anno/sugarglider.gtf
# or
singularity shell -B "${PWD}":/data /data/softwares/agat/agat_1.5.0--pl5321hdfd78af_0.sif
agat_convert_sp_gff2gtf.pl --gtf_version 3 --gff Petaurus_breviceps.v1.v3.KIZ_NCBI.basic.chr.annotation.gff3 -o Petaurus_breviceps.v1.v3.KIZ_NCBI.basic.chr.annotation.gtf
# sort GTF
# copy /usr/local/lib/perl5/site_perl/auto/share/dist/AGAT/agat_config.yaml to your working directory
# set 'output_format: GTF' and 'gtf_output_version: 3'
agat_convert_sp_gxf2gxf.pl -g Neovision_vision.v1.v2.KIZ_NCBI.basic.chr.annotation.gtf -o Neovision_vision.v1.v2.KIZ_NCBI.basic.chr.annotation.sorted.gtf -c agat_config.yaml1 Introduction
For more details, refer to ENCODE RNA-Seq pipeline.
2 Prepare necessary files
2.1 Convert GFF to GTF
2.2 Merge annotations if needed
2.2.1 Prepare input JSON file
{
"merge_anno.annotation": "gencode.v29.primary_assembly.annotation_UCSC_names.gtf.gz",
"merge_anno.tRNA": "gencode.v29.tRNAs.gtf.gz",
"merge_anno.spikeins": "ERCC_phiX.fa.gz",
"merge_anno.output_filename": "merged_annotation_V29.gtf.gz"
}2.2.2 Merge annotations
caper run /data/softwares/encode_pipeline/rna-seq-pipeline_v1.2.4/make_index_wdl/merge_anno.wdl -c /data/softwares/encode_pipeline/caper/local.conf -i merge_anno_input.json --max-concurrent-tasks 2 --singularity3 Build STAR index
3.1 Prepare input JSON file
{
"build_index.reference_sequence": "/data/biodatabase/species/sugar_glider_slb_v1/genome/genome/sugarglider.fasta.gz",
"build_index.spikeins": "/data/biodatabase/species/sugar_glider_slb_v1/genome/genome/ERCC_phix.spikeins.fasta.gz",
"build_index.annotation": "/data/biodatabase/species/sugar_glider_slb_v1/genome/anno/sugarglider.gtf.gz",
"build_index.anno_version": "v1",
"build_index.genome": "sugarglider",
"build_index.index_type": "prep_star",
"build_index.ncpu": 60,
"build_index.memGB": 512
}3.2 Build STAR index
caper run /data/softwares/encode_pipeline/rna-seq-pipeline_v1.2.4/make_index_wdl/build_genome_index.wdl -c /data/softwares/encode_pipeline/caper/local.conf -i /data/biodatabase/species/sugar_glider_slb_v1/encode_references/bulk_rna_seq/star_index_input.json --max-concurrent-tasks 1 --singularity4 Build RSEM index
4.1 Prepare input JSON file
{
"build_index.reference_sequence": "/data/biodatabase/species/sugar_glider_slb_v1/genome/genome/sugarglider.fasta.gz",
"build_index.spikeins": "/data/biodatabase/species/sugar_glider_slb_v1/genome/genome/ERCC_phix.spikeins.fasta.gz",
"build_index.annotation": "/data/biodatabase/species/sugar_glider_slb_v1/genome/anno/sugarglider.gtf.gz",
"build_index.anno_version": "v1",
"build_index.genome": "sugarglider",
"build_index.index_type": "prep_rsem",
"build_index.ncpu": 60,
"build_index.memGB": 512
}4.2 Build RSEM index
caper run /data/softwares/encode_pipeline/rna-seq-pipeline_v1.2.4/make_index_wdl/build_genome_index.wdl -c /data/softwares/encode_pipeline/caper/local.conf -i /data/biodatabase/species/sugar_glider_slb_v1/encode_references/bulk_rna_seq/rsem_index_input.json --max-concurrent-tasks 1 --singularity5 Prepare other files
In most cases, you need to modify the following code to suit yourself.
5.1 Prepare transcript IDs to gene biotypes mapping table
library(rtracklayer)
library(tidyverse)
library(vroom)
# GTF only
# mandatory field: transcript_id
gtf_file <- "/data/database/annotation/release/petaurus_breviceps/Petaurus_breviceps.v1.v1.NCBI_Pipeline.basic.chr.annotation.gtf.gz"
type_field <- "type"
transcript_type_values <- c("transcript")
gene_type_values <- c("gene")
# with higher precedence than transcript biotype field
gene_biotype_field <- "gene_biotype"
# all transcripts lacking gene biotype field
# or transcript biotype field will be assigned this value
gene_biotype_default <- "protein_coding"
# if there is no gene biotype field,
# and transcript biotype field exists,
# the transcript biotype field will be used
transcript_biotype_field <- "transcript_biotype"
df <- as.data.frame(rtracklayer::import(gtf_file, format = "gtf"))
if (any(gene_type_values %in% unique(df[[type_field]])) && gene_biotype_field %in% names(df)) {
message("gene biotypes will be used")
gene_df <- filter(df, .data[[type_field]] %in% gene_type_values) %>%
select(all_of(c("gene_id", gene_biotype_field))) %>%
na.omit() %>%
distinct()
transcript_df <- filter(df, .data[[type_field]] %in% transcript_type_values) %>%
select(all_of(c("transcript_id", "gene_id"))) %>%
na.omit() %>%
distinct()
transcript_df <- left_join(transcript_df, gene_df, by = "gene_id") %>%
select(-gene_id) %>%
distinct()
transcript_df$gene_biotype[is.na(transcript_df$gene_biotype)] <- gene_biotype_default
} else if (transcript_biotype_field %in% names(df)) {
message("transcript biotypes will be used")
transcript_df <- filter(df, .data[[type_field]] %in% transcript_type_values) %>%
select(all_of(c("transcript_id", transcript_biotype_field))) %>%
na.omit() %>%
distinct()
} else {
message("the default gene biotype will be used for all transcripts")
transcript_df <- filter(df, .data[[type_field]] %in% transcript_type_values) %>%
select(transcript_id) %>%
mutate(gene_biotype = gene_biotype_default) %>%
na.omit() %>%
distinct()
}
if (nrow(transcript_df) == 0) {
stop("no transcripts found after filtering")
} else {
vroom_write(
transcript_df,
file = gsub("(?i)(gff(3)?|gtf)(\\.gz)?$",
"transcript_id_to_gene_type.tsv",
gtf_file,
perl = TRUE
),
delim = "\t",
col_names = FALSE,
append = FALSE
)
}5.2 Prepare gene/transcript IDs to names mapping table
library(YRUtils)
library(vroom)
gff_file <- "/data/database/annotation/release/petaurus_breviceps/Petaurus_breviceps.v1.v1.NCBI_Pipeline.basic.chr.annotation.gtf.gz"
target_type <- "transcript"
df <- extract_gene_id_from_gff(gff_file, target_type = target_type)
if (nrow(df) == 0) {
stop("no valid items found")
} else {
vroom_write(
df,
file = gsub("(?i)(gff(3)?|gtf)(\\.gz)?$",
paste0(target_type, "_id_name_mapping_table.tsv"),
gff_file,
perl = TRUE
),
col_names = TRUE, append = FALSE
)
}